import re
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import plotly.express as px
from plotly.offline import init_notebook_mode
## Notebook configuration: initialise Plotly's notebook mode and reset the Matplotlib style.
init_notebook_mode(connected=True)
plt.style.use('default')
## Lift the pandas display limits on column count and output width.
## The option name is written in full: pandas resolves abbreviated names by pattern
## matching, which can stop working if a similarly named option is added later.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
## Load the life-expectancy data set.
## NOTE(review): this is an absolute path on one machine; adjust it before running elsewhere.
who_df = pd.read_csv(r'C:\Users\eyinl\OneDrive\Desktop\Python and ML\files\Assignment Data\Life Expectancy Data.csv')
## First look at the data: sample rows, dimensions, dtypes and summary statistics.
who_df.head(10)
who_df.shape
who_df.info()
who_df.describe()
who_df.describe(include='all')
## Number of fully duplicated rows.
who_df.duplicated().sum()
## Remove leading and trailing whitespace from the column names.
## str.strip() removes every surrounding whitespace character (not just one per side)
## and avoids the non-raw regex literals ("^\s"), which are invalid escape sequences.
column = who_df.columns.to_list()
column
column = [name.strip() for name in column]
column
who_df.columns = column
who_df.columns.to_list()
## Missing-value handling.
## Count the missing (NA) records per column before imputation.
who_df.isnull().sum()
## Collect, in a single pass, the columns that have at least one missing record and list them.
missing_columns = who_df.columns[who_df.isnull().any()].to_list()
for name in missing_columns:
    print(name)
## Replace the missing records with the median of their own column.
## SimpleImputer computes one median per column, so imputing all affected columns in one
## call gives the same values as imputing them one at a time.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
## Guard: fitting the imputer on an empty column selection would fail.
if missing_columns:
    who_df[missing_columns] = imputer.fit_transform(who_df[missing_columns])
## Confirm that no missing records remain.
who_df.isnull().sum()
## Preparing the correlation matrix.
## Only numeric columns are correlated: the frame is expected to hold text columns too
## (e.g. 'Country', 'Status'), and DataFrame.corr() raises on those in pandas >= 2.0.
correlation_matrix = who_df.select_dtypes(include='number').corr()
correlation_matrix
## A heatmap shows the correlations graphically; each cell is annotated to two decimals.
fig = plt.figure(figsize=(16, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='Greens')
plt.show()
Clearly, some of the predictors chosen initially do affect life expectancy. The predictors most strongly associated with life expectancy are "Adult Mortality", infant mortality, "HIV/AIDS", "BMI", "Income composition of resources", and "Schooling". Also, there is a correlation of 1 between the columns "under-five deaths" and "infant deaths", so we will drop one of them. This will enable us to deal with multicollinearity.
## NOTE: the drop of 'infant deaths' is left disabled because a later scatter plot
## ('Infant Mortality vs Life expectancy') still reads that column.
## who_df.drop('infant deaths', axis = 1, inplace = True)
who_df.columns
who_df.shape
## Distinct values of the country status column.
who_df['Status'].unique()
## Violin plot (with an inner box plot) of life expectancy per country status.
## Columns are passed by name so Plotly takes the axis labels from the column names.
fig = px.violin(
    who_df,
    x='Status',
    y='Life expectancy',
    box=True,
    color='Status',
    template='plotly_dark',
    title='<b> Life Expectancy based on Country Status',
)
fig.show()
This implies that people in developed countries have a higher life expectancy than people in developing countries.
## Scatter plot of income composition of resources against life expectancy, one colour per country.
## Columns are passed by name (not as Series) so Plotly takes the axis labels from the
## column names; the backslash continuation is unnecessary inside parentheses.
fig = px.scatter(
    who_df,
    x='Life expectancy',
    y='Income composition of resources',
    color='Country',
    size='Life expectancy',
    template='plotly_dark',
    title='Income composition of Resources vs Life expectancy',
)
fig.show()
Here we can see that life expectancy increases as the income composition of resources increases.
## Animated scatter plot of schooling against life expectancy, one frame per year.
## The rows are sorted by 'Year' before plotting.
rows_by_year = who_df.sort_values(by='Year')
fig = px.scatter(
    rows_by_year,
    x='Life expectancy',
    y='Schooling',
    size='Life expectancy',
    color='Country',
    template='plotly_dark',
    animation_frame='Year',
    animation_group='Country',
    title='Schooling vs Life Expectancy for each Year',
)
fig.show()
Here, we can say that life expectancy increases as the schooling rate increases.
## Scatter plot of GDP against life expectancy, one colour per country.
## Columns are passed by name (not as Series) so Plotly takes the axis labels from the
## column names; the backslash continuation is unnecessary inside parentheses.
fig = px.scatter(
    who_df,
    x='Life expectancy',
    y='GDP',
    color='Country',
    size='Life expectancy',
    template='plotly_dark',
    title='GDP vs Life expectancy',
)
fig.show()
Here, an overall increase in the GDP of a country corresponds to an increase in its life expectancy.
## Animated scatter plot of GDP against life expectancy, one frame per year.
## The rows are sorted by 'Year' before plotting.
gdp_animation_options = dict(
    x='Life expectancy',
    y='GDP',
    animation_frame='Year',
    animation_group='Country',
    color='Country',
    template='plotly_dark',
    size='Life expectancy',
    title='<b>Life Expectancy Vs GDP of Countries in each Year',
)
fig = px.scatter(who_df.sort_values(by='Year'), **gdp_animation_options)
fig.show()
Here, a yearly increase in the GDP of a country corresponds to an increase in its yearly life expectancy.
## Scatter plot of the HIV/AIDS column against life expectancy, one colour per country.
## The y column is passed by name (not as a Series), consistent with the x argument,
## so Plotly takes the axis label from the column name.
fig = px.scatter(
    who_df,
    x='Life expectancy',
    y='HIV/AIDS',
    color='Country',
    size='Life expectancy',
    template='plotly_dark',
    title='<b> HIV/Aids vs Life expectancy',
)
fig.show()
Here, an increase in the prevalence of HIV/AIDS corresponds to a drastic decrease in life expectancy.
## Scatter plot of percentage expenditure against life expectancy, one colour per country.
fig = px.scatter(
    who_df,
    x='Life expectancy',
    y='percentage expenditure',
    color='Country',
    size='Life expectancy',
    template='plotly_dark',
    title='<b> Percentage Expenditure vs Life expectancy',
)
fig.show()
Clearly, a country with a life expectancy below 65 can improve its average lifespan by increasing its healthcare expenditure.
## Scatter plot of adult mortality against life expectancy, one colour per country.
## Colour and size are passed as column names (not as Series), consistent with x and y,
## so Plotly takes the legend and hover labels from the column names.
fig = px.scatter(
    who_df,
    x='Life expectancy',
    y='Adult Mortality',
    color='Country',
    size='Life expectancy',
    opacity=0.6,
    template='plotly_dark',
    title='<b> Adult Mortality vs Life Expectancy',
)
fig.show()
Here we observe that an increase in the Adult Mortality rate would greatly reduce the average Life expectancy of a country.
## Scatter plot of the 'infant deaths' column against life expectancy, one colour per country.
infant_plot_options = dict(
    x='Life expectancy',
    y='infant deaths',
    size='Life expectancy',
    color='Country',
    template='plotly_dark',
    title='Infant Mortality vs Life expectancy',
)
fig = px.scatter(who_df, **infant_plot_options)
fig.show()
Here, an increase in the infant mortality rate corresponds to a slight decrease in life expectancy.
## Scatter plot of alcohol consumption against life expectancy, one colour per country.
fig = px.scatter(
    who_df,
    x='Life expectancy',
    y='Alcohol',
    color='Country',
    size='Life expectancy',
    template='plotly_dark',
    title='Alcohol vs Life expectancy',
)
fig.show()
Here, as alcohol consumption increases, there is a slight increase in life expectancy, i.e. a positive correlation.
## Scatter plot of population against life expectancy, one colour per country.
population_plot_options = dict(
    x='Life expectancy',
    y='Population',
    size='Life expectancy',
    color='Country',
    template='plotly_dark',
    title='Population vs Life Expectancy',
)
fig = px.scatter(who_df, **population_plot_options)
fig.show()
Here, we can conclude that more populous countries tend to have lower life expectancy; note that the plot uses the "Population" column (total population), not population density.